#' Original outlierKD function by Klodian Dhana,
#' https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
#' Modified to have a third argument for removing outliers instead of an interactive prompt,
#' and after removing outliers, the original df will not be changed. The function returns a df,
#' which can be saved as the original df name if desired.
#' Also added QQ-plot in the output, with options to show/hide boxplot, histogram, qqplot.
#' Check outliers, and option to remove them, save as a new dataframe.
#' @param df The dataframe.
#' @param var The variable in the dataframe to be checked for outliers (bare column name, not a string).
#' @param rm Boolean. Whether to remove outliers (replace them with NA) or not.
#' @param boxplt Boolean. Whether to show the boxplot, before and after outliers removed.
#' @param histogram Boolean. Whether to show the histogram, before and after outliers removed.
#' @param qqplt Boolean. Whether to show the qqplot, before and after outliers removed.
#' @return The dataframe with outliers replaced by NA if rm==TRUE, or df if nothing changed
#' @examples
#'   outlierKD2(mydf, height, FALSE, TRUE, TRUE, TRUE)
#'   mydf = outlierKD2(mydf, height, TRUE, TRUE, TRUE, TRUE)
#'   mydfnew = outlierKD2(mydf, height, TRUE)
#' @export
outlierKD2 <- function(df, var, rm = TRUE, boxplt = TRUE, histogram = TRUE, qqplt = TRUE) {
  dt <- df # duplicate the dataframe so the caller's df is never altered
  # Non-standard evaluation: resolve the bare column name `var` inside dt.
  var_name <- eval(substitute(var), eval(dt))
  na1 <- sum(is.na(var_name))            # NA count before outlier removal
  m1 <- mean(var_name, na.rm = TRUE)     # mean including outliers
  colTotal <- boxplt + histogram + qqplt # number of chart columns requested
  # Save the caller's graphical parameters and restore them when the function
  # exits (fix: par() settings previously leaked out of the function).
  # max(2, colTotal) keeps the layout valid when 0 or 1 chart is selected.
  opar <- par(mfrow = c(2, max(2, colTotal)), oma = c(0, 0, 3, 0))
  on.exit(par(opar), add = TRUE)
  if (qqplt) {
    qqnorm(var_name, main = "With outliers")
    qqline(var_name)
  }
  if (histogram) { hist(var_name, main = "With outliers", xlab = NA, ylab = NA) }
  if (boxplt) { boxplot(var_name, main = "With outliers") }

  # Outliers as defined by the fences used by boxplot.stats().
  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier) # NaN when there are no outliers
  var_name <- ifelse(var_name %in% outlier, NA, var_name) # mask outliers with NA
  if (qqplt) {
    qqnorm(var_name, main = "Without outliers")
    qqline(var_name)
  }
  if (histogram) { hist(var_name, main = "Without outliers", xlab = NA, ylab = NA) }
  if (boxplt) { boxplot(var_name, main = "Without outliers") }

  if (colTotal > 0) { # if no charts are wanted, skip the title and the summary
    title("Outlier Check", outer = TRUE)
    na2 <- sum(is.na(var_name))
    cat("Outliers identified:", na2 - na1, "\n")
    cat("Proportion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name)) * 100, 1), "\n")
    cat("Mean of the outliers:", round(mo, 2), "\n")
    m2 <- mean(var_name, na.rm = TRUE)
    cat("Mean without removing outliers:", round(m1, 2), "\n")
    cat("Mean if we remove outliers:", round(m2, 2), "\n")
  }

  if (rm) {
    # Write the NA-masked column back into the copy, keyed by the column name.
    dt[as.character(substitute(var))] <- var_name
    cat("Outliers successfully removed", "\n")
    return(invisible(dt))
  } else {
    cat("Nothing changed", "\n")
    return(invisible(df))
  }
}

Research Topic:

There is considerable evidence indicating lending disparities throughout the United States (Steil et al., 2018). For our research topic, we will explore lending practices in one of the fastest-appreciating real estate markets over the past thirty years — the state of California. Specifically, we aim to look at the factors that are associated with denials for non-commercial mortgage loans.

Our SMART question is:

“Which factors drove denials for mortgages in California in 2019?”

To answer this question, we are using the Federal Financial Institutions Examination Council’s (FFIEC) Home Mortgage Disclosure Act (HMDA) dataset from 2019, located here: https://ffiec.cfpb.gov/data-publication/dynamic-national-loan-level-dataset/2019. We are focusing on a subset of data of 10,000 observations from 2019 that we will further filter on California leaving us with 5,196 observations.

Our GitHub repository address is: https://github.com/brandonchin19/Team3/.

Load in dataset

# Read the HMDA extract into a dataframe. (read.csv already returns a
# data.frame, so the data.frame() wrapper is redundant but harmless.)
hmda_ca <- data.frame(read.csv("hmda_ca_new.csv"))
str(hmda_ca)

Check number of rows

# Row/column count of the raw extract.
dim(hmda_ca)

Check head and tail of dataframe

# Peek at the first/last 5 rows. xkabledplyhead/xkabledplytail are project
# helpers — presumably from the ezids package referenced later (L446); confirm.
xkabledplyhead(hmda_ca,5)
xkabledplytail(hmda_ca,5)

Subsetting to California Only; non-business properties; principal residences only

Removed “msa = derived_msa_md,” because of error

# Keep only non-commercial loans (business_or_commercial_purpose == "2").
# (fix: this identical subset/str pair was previously run twice back-to-back;
# the second run was a no-op duplicate and has been removed)
hmda_ca <- subset(hmda_ca,business_or_commercial_purpose=="2")
str(hmda_ca)
#rename(hmda_ca, ethnicity=derived_ethnicity, race=derived_race,
       #sex=derived_sex)
# NOTE(review): the commented rename above never had an effect even when
# active — its result was not assigned back to hmda_ca.

Subsetting principal residences only; tail and head check to make sure that the geography is widespread/our sample is “random”; there are now 59 distinct counties.

# Keep principal residences only (occupancy_type == "1").
hmda_ca <- subset(hmda_ca,occupancy_type=="1")
dim(hmda_ca) #45735    99
#xkabledplyhead(hmda_ca,5)
#xkabledplytail(hmda_ca,5)
# Count distinct counties via SQL to confirm the sample's geographic spread.
loadPkg("sqldf")
names(hmda_ca)
sqldf("select count(distinct(county_code)) from hmda_ca")
unloadPkg("sqldf")

Subsetting to only relevant actions: denial or approval

# Keep only the two terminal actions: "1" (approved/originated) and "3"
# (denied) — these codes match the Approved/Denied recode later in this script.
hmda_ca1<-hmda_ca%>%filter(action_taken %in% c("1", "3"))
hmda_ca<-hmda_ca1
dim(hmda_ca) #30661    99

Subsetting relevant variables for answering the SMART question

# Select the analysis columns by position.
# NOTE(review): magic column indices are fragile — selecting by name would be
# safer if the extract's column order ever changes.
hmda_ca_final <- hmda_ca[c(10,11,12,13,22,24,39,46,50,62,74,78)]
str(hmda_ca_final)

Changing vector types

# Convert columns to appropriate types: demographics to factors, money/rate
# columns to numeric.
hmda_ca_final_1 = hmda_ca_final
hmda_ca_final_1$derived_ethnicity = factor(hmda_ca_final$derived_ethnicity)
hmda_ca_final_1$derived_race = factor(hmda_ca_final$derived_race)
hmda_ca_final_1$derived_sex = factor(hmda_ca_final$derived_sex)
#hmda_ca_final_1$action_taken = factor(hmda_ca_final$action_taken)
# NOTE(review): if these columns came in as character (e.g. "Exempt" entries),
# as.numeric() coerces non-numbers to NA with a warning — confirm intended.
hmda_ca_final_1$loan_amount = as.numeric(hmda_ca_final$loan_amount)
hmda_ca_final_1$interest_rate = as.numeric(hmda_ca_final$interest_rate)
hmda_ca_final_1$property_value = as.numeric(hmda_ca_final$property_value)
hmda_ca_final_1$income = as.numeric(hmda_ca_final$income)
hmda_ca_final_1$applicant_age = factor(hmda_ca_final$applicant_age)
str(hmda_ca_final_1)

#Examine and filter out the missing values

# Examine and handle the missing values.
missvalue <- is.na(hmda_ca_final_1)
summary(missvalue)
#There are 30,661 observations after we've filtered on all of the relevant fields. 
#Missing values are present in interest_rate: 6,919; property_value: 1,734, and income: 1,302.

# Impute missing interest rates with the column mean.
# (fix: this was previously a chained assignment that also stored the scalar
# mean into hmda_ca_final_2, which was immediately overwritten below anyway)
hmda_ca_final_1$interest_rate[is.na(hmda_ca_final_1$interest_rate)] <-
  mean(hmda_ca_final_1$interest_rate, na.rm = TRUE)

# Drop rows with NA in any remaining column.
# (fix: base na.omit() has no `cols` argument — it was silently ignored and
# every column was checked; also removed the dead
# `drop_na(applicant_ethnicity.1)` line whose result was discarded)
hmda_ca_final_2 <- na.omit(hmda_ca_final_1)

missvalue1 <- is.na(hmda_ca_final_2)
summary(missvalue1)
#After cleaning out the missing values, we are left with 28,695 observations

VI. Exploratory Data Analysis

#Exploring the categorical values first

# Bar charts of the categorical variables (Graphs 1-5).
library(ggplot2)
ggplot(hmda_ca_final_2, aes(x = factor(derived_sex))) +
    geom_bar(color="black", fill="antiquewhite2")+
  labs(title="Graph 1. Applicant sex distribution", x="Applicant Sex", y="Count")

ggplot(hmda_ca_final_2, aes(x = factor(applicant_age))) +
    geom_bar(color="black", fill="bisque3")+
  labs(title="Graph 2. Applicant age distribution", x="Applicant Age", y="Count")

ggplot(hmda_ca_final_2, aes(x = factor(derived_ethnicity))) +
    geom_bar(color="black", fill="cornsilk3")+
  labs(title="Graph 3. Applicant ethnicity distribution", x="Applicant Ethnicity", y="Count")

ggplot(hmda_ca_final_2, aes(x = factor(action_taken))) +
    geom_bar(color="black", fill="azure3")+
  labs(title="Graph 4. Action taken distribution", x="Action Taken", y="Count")

# Race labels are long, so rotate the x-axis text 90 degrees.
ggplot(hmda_ca_final_2, aes(x = factor(derived_race))) +
    geom_bar(color="black", fill="azure3")+
  labs(title="Graph 5. Derived Race", x="Race of the Applicant", y= "Count")+theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Next, we proceed to exploring the numerical data for normality and outliers. Graphs 6 and 8–10 indicate that none of the numerical variables are normally distributed.

# Histograms of the numeric variables (Graphs 6, 8-10; note Graph 7 is skipped
# in the numbering).
ggplot(hmda_ca_final_2,aes(x=loan_amount))+
  geom_histogram(color="black", fill="steelblue")+
  labs(title=" Graph 6. Histogram of Loan Amount", x="Loan Amount in dollars", y="Frequency")+theme_bw()

ggplot(hmda_ca_final_2,aes(x=interest_rate))+
  geom_histogram(color="black", fill="pink")+
  labs(title="Graph 8. Histogram of Interest Rate", x="Interest Rates", y="Frequency")+theme_bw()

ggplot(hmda_ca_final_2,aes(x=income))+
  geom_histogram(color="black", fill="azure3")+
  labs(title="Graph 9. Histogram of Income", x="Income in dollars", y="Frequency")+theme_bw()

ggplot(hmda_ca_final_2,aes(x=property_value))+
  geom_histogram(color="black", fill="lightblue")+
  labs(title="Graph 10. Histogram of Property Values", x="Property Values in dollars", y="Frequency")+theme_bw()

As another check, we inspect qqnorm plots and confirm that the distributions are not normal (Graphs 11–14).

# Normal QQ plots of the numeric variables before outlier removal.
# NOTE(review): the graph titles appear out of numeric order here (11, 13, 12).
qqnorm(hmda_ca_final_2$interest_rate,
       main="Graph 11. QQ Plot of Interest Rates",
       ylab="Interest Rate",
       col="pink")
qqline(hmda_ca_final_2$interest_rate)

qqnorm(hmda_ca_final_2$income,
       main="Graph 13. QQ Plot of Income",
       ylab="Income",
       col="green")
qqline(hmda_ca_final_2$income)

qqnorm(hmda_ca_final_2$property_value,
       main="Graph 12. QQ Plot of Property Values",
       ylab="Property Value",
       col="blue")
qqline(hmda_ca_final_2$property_value)

qqnorm(hmda_ca_final_2$loan_amount,
       main="Graph 14.QQ Plot of Loan Amount",
       ylab="Loan Amount",
       col="purple")
# fix: qqline previously used hmda_ca_final_1 (the pre-cleaning data), so the
# reference line did not match the points plotted from hmda_ca_final_2.
qqline(hmda_ca_final_2$loan_amount)

Next, we remove outliers and check for normality again.

# Replace outliers (per boxplot.stats fences) with NA one variable at a time,
# using outlierKD2() defined above. Each call returns a NEW dataframe; the
# original inputs are unchanged. rm defaults to TRUE.
hmda_ca_final_no_outliers_3 <- outlierKD2(hmda_ca_final_2,interest_rate)

hmda_ca_final_no_outliers_4 <- outlierKD2(hmda_ca_final_no_outliers_3,property_value)

hmda_ca_final_no_outliers_5 <- outlierKD2(hmda_ca_final_no_outliers_4,income)

loans <- outlierKD2(hmda_ca_final_no_outliers_5,loan_amount)

We then reexamine the variables for normality after removing the outliers.

# Re-check normality of each numeric variable after outlier removal
# (Graphs 15-18).
qqnorm(loans$interest_rate,
       main="Graph 15. QQ Plot of Interest Rates",
       ylab="Interest Rate",
       col="pink")
qqline(loans$interest_rate)

qqnorm(loans$property_value,
       main="Graph 16. QQ Plot of Property Values",
       ylab="Property Value",
       col="blue")
qqline(loans$property_value)

qqnorm(loans$income,
       main="Graph 17.QQ Plot of Income",
       ylab="Income",
       col="green")
qqline(loans$income)

qqnorm(loans$loan_amount,
       main="Graph 18. QQ Plot of Loan Amount",
       ylab="Loan Amount",
       col="purple")
qqline(loans$loan_amount)

The values do not appear to be normally distributed even after removing the outliers

# Five-number summaries of the numeric variables, rendered with kableExtra.
# NAs appear here because outlierKD2 replaces outliers with NA rather than
# dropping the affected rows.
Numerical_var <- subset(loans,select=c(loan_amount, income, property_value, interest_rate))
#str(Numerical_var)
library(kableExtra)
summary_t<-kbl(summary(Numerical_var))%>%
  kable_styling()
summary_t

# GROUP: what's with the NAs in the summary table after we have cleaned the NAs out?

However, the means and the medians are fairly close in all instances; do we keep removing the outliers?

#this chunk is for visual inspection only: EITHER DELETE or FORMAT before the final submission
# Quick base-graphics boxplots to eyeball remaining spread after NA-masking.
boxplot(loans$interest_rate)

boxplot(loans$loan_amount)

boxplot(loans$property_value)

boxplot(loans$income)

Starting the tests

# Scatterplot matrix of all columns, and a frequency table of derived_race.
# (lattice is loaded but pairs() is base graphics — the library call may be
# unnecessary here.)
library(lattice)
#dim(loans) #28695    12
pairs(loans)

#str(loans)

# tab1 (epiDisplay) prints a frequency table plus a bar chart.
library(epiDisplay)
tab1(loans$derived_race, sort.group = "decreasing", cum.percent = TRUE)

##Preparing data for corrplot

# Correlation matrix of the numeric loan variables, using complete rows only.
# NOTE(review): `loans_all_num` is not defined anywhere in this file — this
# chunk will error unless it is created elsewhere; it presumably should be a
# numeric-only subset of `loans` (e.g. the Numerical_var built above). Confirm.
cor_loans<-cor(loans_all_num, use="complete.obs")
xkabledply(cor_loans)
loadPkg("corrplot")
corrplot(cor_loans)

Chi-Square Test for Loan Approval and Race: result=>reject the null of independence: all tests REJECT THE NULL HYPOTHESIS OF INDEPENDENCE NOW THAT WE HAVE MORE DATA (Applicant age is the only issue)

# Chi-square tests of independence between action_taken and each demographic
# variable, with contingency tables.
# NOTE(review): the unindented lines below ("Contingency table for ...", the
# table rows, and the "## ..." blocks) are PASTED CONSOLE OUTPUT, not R code —
# this section will not run as-is and should be cleaned before submission.
contable1 = table(loans$derived_race, loans$action_taken)
xkabledply(contable1, title="Contingency table for Loan Approval and Race")
Contingency table for Loan Approval and Race
1 3
2 or more minority races 75 52
American Indian or Alaska Native 170 88
Asian 2672 824
Black or African American 1242 663
Joint 1062 287
Native Hawaiian or Other Pacific Islander 158 85
Race Not Available 3287 1008
White 14107 2915
chitest1 = chisq.test(contable1)
chitest1
## 
##  Pearson's Chi-squared test
## 
## data:  contable1
## X-squared = 492, df = 7, p-value <2e-16
contable2 = table(loans$derived_sex, loans$action_taken)
xkabledply(contable2, title="Contingency table for Loan Approval and Sex")
Contingency table for Loan Approval and Sex
1 3
Female 4639 1346
Joint 10354 2065
Male 6771 2161
Sex Not Available 1009 350
chitest2 = chisq.test(contable2)
chitest2
## 
##  Pearson's Chi-squared test
## 
## data:  contable2
## X-squared = 225, df = 3, p-value <2e-16
contable3 = table(loans$derived_ethnicity, loans$action_taken)
xkabledply(contable3, title="Contingency table for Loan Approval and Ethnicity")
Contingency table for Loan Approval and Ethnicity
1 3
Ethnicity Not Available 2891 830
Hispanic or Latino 4145 1291
Joint 1229 285
Not Hispanic or Latino 14508 3516
chitest3 = chisq.test(contable3)
chitest3
## 
##  Pearson's Chi-squared test
## 
## data:  contable3
## X-squared = 56, df = 3, p-value = 5e-12
contable4 = table(loans$applicant_age, loans$action_taken)
xkabledply(contable4, title="Contingency table for Loan Approval and Age")
Contingency table for Loan Approval and Age
1 3
<25 253 58
>74 1621 373
25-34 3897 823
35-44 5606 1417
45-54 4924 1382
55-64 3674 1089
65-74 2798 779
8888 0 1
# NOTE(review): the "8888" age category above looks like a sentinel/missing
# code with a single observation — it likely drives the chisq warning for age.
chitest4 = chisq.test(contable4)
chitest4
## 
##  Pearson's Chi-squared test
## 
## data:  contable4
## X-squared = 63, df = 7, p-value = 4e-11

#adding New Code for final and downsampling

# Recode action_taken into a labelled factor: "1" -> Approved, else Denied.
hmda <- data.frame(loans)
str(hmda$action_taken)


hmda$approval<- ifelse(hmda$action_taken=="1", "Approved","Denied")
hmda$approval<- factor(hmda$approval)
str(hmda)
table(hmda$approval)

#Approved   Denied 
   #22773     5922 

Checking frequency of approval variable and derived_race variable

# Frequency tables of race and approval, evaluated inside hmda via with().
with(hmda,
     {
       print(table(derived_race))
       print(table(approval))
     }
)

Creating DownSample of hmda dataset, the Approved far outnumber the Denied making this dataset severely unbalanced #Approved: 22773 Denied: 5922

# Downsample the majority (Approved) class to the size of the Denied class to
# balance the dataset.
# NOTE(review): sample() is not seeded — the downsample differs on every run;
# add set.seed() for reproducibility.
Approved<- which(hmda$action_taken=="1")
Denied<- which(hmda$action_taken=="3")

length(Approved)
length(Denied)

approved_downsample<-sample(Approved,length(Denied))
hmda_down<- hmda[c(approved_downsample, Denied),]
str(hmda_down)
# Select columns by position and add 0/1 indicator columns for the outcome.
# NOTE(review): cor() requires numeric input — confirm columns 4:11, 13:17, 19
# of hmda_down are all numeric, otherwise this chunk errors.
hmda_downnum<-hmda_down[c(4:11, 13:17, 19)]
hmda_downnum$approved<- ifelse(hmda_downnum$action_taken=="1", 1, 0)
hmda_downnum$denied<- ifelse(hmda_downnum$action_taken=="3", 1, 0)

str(hmda_downnum)
cor_loans<-cor(hmda_downnum, use="complete.obs")
xkabledply(cor_loans)
loadPkg("corrplot")
corrplot(cor_loans)

Conducting Chi-squared Test on the downsampled dataset (hmda_down)

P-Value for contable1 is lower than .05 therefore we reject the null hypothesis. action_taken and derived_race are not independent of each other

# Repeat the chi-square tests on the balanced (downsampled) dataset.
# Note: these reuse the contable1..4 / chitest1..4 names from the earlier
# unbalanced analysis, overwriting those objects.
contable1= table(hmda_down$derived_race, hmda_down$action_taken)
xkabledply(contable1, title="Contingency table for Loan Approval and Race")
chitest1 = chisq.test(contable1)
chitest1
contable2= table(hmda_down$derived_ethnicity, hmda_down$action_taken)
xkabledply(contable2, title="Contingency table for Loan Approval and Ethnicity")
chitest2 = chisq.test(contable2)
chitest2
contable3= table(hmda_down$derived_sex, hmda_down$action_taken)
xkabledply(contable3, title="Contingency table for Loan Approval and Gender")
chitest3 = chisq.test(contable3)
chitest3
contable4= table(hmda_down$applicant_age, hmda_down$action_taken)
xkabledply(contable4, title="Contingency table for Loan Approval and Age")
chitest4 = chisq.test(contable4)
chitest4

Full linear model for action_taken

# Baseline OLS regression of action_taken on every other column.
# NOTE(review): action_taken is the raw numeric codes 1/3 here, so this is a
# linear model on codes, not a classification model — interpret with care.
full_model <- lm(action_taken~., data = loans)
summary(full_model)

Removing specific race fields due to NAs, running model again

# Drop the specific-race columns (which contain NAs) and refit.
loans_for_modelling <- loans[1:12]
full_model_2 <- lm(action_taken~., data = loans_for_modelling)
summary(full_model_2)

#{r} #pcr.fit=pcr(action_taken~.,data=loans_for_modelling,scale=FALSE,validation ="CV") #ezids::xkabledplyhead(loans_for_modelling) #summary(pcr.fit) #

str(loans_for_modelling)

Linear models for action_taken

# Nested linear models of action_taken on the demographic predictors.
# (fix: lm() has no `family` argument — the previous family = "binomial" was
# silently disregarded by lm.fit, so these have always been plain OLS fits;
# the misleading argument is removed. For true logistic regression see the
# glm() models later in this script.)
action_taken_fit1 <- lm(action_taken ~ derived_race, data = loans_for_modelling)
summary(action_taken_fit1)
xkablevif(action_taken_fit1)
action_taken_fit2 <- lm(action_taken ~ derived_race + derived_ethnicity, data = loans_for_modelling)
summary(action_taken_fit2)
action_taken_fit3 <- lm(action_taken ~ derived_race + derived_ethnicity + derived_sex, data = loans_for_modelling)
summary(action_taken_fit3)
xkablevif(action_taken_fit3)
action_taken_fit4 <- lm(action_taken ~ derived_race + derived_ethnicity + derived_sex + applicant_age, data = loans_for_modelling)
summary(action_taken_fit4)
xkablevif(action_taken_fit4)

The multiple R^2s increase with each successive model, indicating slight improvements from action_taken_fit1 through action_taken_fit4.

ANOVA comparison of models

# Compare the four nested models with an F-test ANOVA table.
# (Right-assignment `->` is unconventional; `anovaRes <- anova(...)` is the
# usual style.)
anova(action_taken_fit1,action_taken_fit2,action_taken_fit3,action_taken_fit4) -> anovaRes
str(anovaRes)
xkabledply(anovaRes, title = "ANOVA comparison between the models")

The residuals decrease with each successive linear model. This suggests that action_taken_fit4 is a better regression model for explaining this data.

Changing action_taken to factor for logit

# Convert action_taken to a factor so glm(family = "binomial") treats it as a
# binary outcome.
# (fix: removed the no-op self-assignments `loans_for_modelling =
# loans_for_modelling` and `hmda_down = hmda_down`.)
loans_for_modelling$action_taken <- factor(loans_for_modelling$action_taken)
hmda_down$action_taken <- factor(hmda_down$action_taken)

Logit models for action_taken

# Logistic regression (logit) models on the balanced (downsampled) data.
# NOTE(review): with factor levels ("1","3"), glm models P(action_taken=="3"),
# i.e. the probability of DENIAL, not approval — confirm when interpreting
# coefficients and predictions.
approval_logit <- glm(action_taken ~ derived_race + derived_ethnicity + derived_sex + loan_amount + income, data = hmda_down, family = "binomial")
summary(approval_logit)
xkablevif(approval_logit)
approval_logit_1 <- glm(action_taken ~ derived_race, data = hmda_down, family = "binomial")
summary(approval_logit_1)
xkablevif(approval_logit_1)

Sample predictions

  1. What are the chances of loan approval for a Black, non-Latino woman with an income of 45,000 per year for a requested loan amount of 100,000?
  2. What are the chances of loan approval for a Black borrower?
  3. What are the chances of loan approval for a white borrower?
#define new observations
newdata1 = data.frame(derived_race = 'Black or African American', derived_sex = 'Female', loan_amount = 100000, derived_ethnicity = 'Not Hispanic or Latino', income = 45000)
newdata2 = data.frame(derived_race = 'Black or African American')
newdata3 = data.frame(derived_race = 'White')


# NOTE(review): predict(type="response") on these binomial glms returns
# P(action_taken == "3") — the probability of DENIAL (the second factor
# level) — so the "chances of loan approval" wording in the prompts above may
# be inverted; confirm. Reusing newdata* names for the predictions also
# overwrites the input data frames.
newdata1 = predict(approval_logit, newdata1, type="response")
newdata2 = predict(approval_logit_1, newdata2, type="response")
newdata3 = predict(approval_logit_1, newdata3, type="response")
newdata1
newdata2
newdata3
  1. Close to 0% (model too complicated/problem with those numerical values just not being present in the data?)
  2. 68.1%
  3. 44%
summary(hmda_down$action_taken)

#SideNotes on ANOVA Testing These are some constructs of my ANOVA test I left them here just incase I need to revist them, these might not be perfect because I ran multiple different variants, I assume it will not be used ultimately and if we need too we can run chi-squared on the balanced dataset.

Conducting ANOVA Test on action_taken variable and income, saved test as anova

anova <- aov(income ~ approval, data=hmda)
str(anova)

#Plotting ANOVA TEST

loadPkg("ggplot2")
ggplot(hmda, aes(x=approval, y=income)) +
  geom_boxplot(colour=c("#ff0000", "#11cc11")) +
  labs(title="Income difference between Approved and Denied Apps", x="Approval Status", y="Income")

#plot(income ~ action_taken, data=hmda)
#anova= aov(action_taken ~ Black, data=hmda)
# anovaRes # this does not give the easy-to-read result of the aov analysis
#names(anova)
# summary(anovaRes)
#xkabledply(anova) # same exact result with or without re-ordering.